# Computations
import numpy as np
import pandas as pd
# scipy
import scipy.stats as stats
# sklearn
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
# keras
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD
from keras.utils.vis_utils import plot_model
import keras.backend as K
# Visualisation libraries
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
import missingno as msno
import plotly.offline as py
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
from plotly.subplots import make_subplots
from wordcloud import WordCloud
import re
# Graphics in retina format
%config InlineBackend.figure_format = 'retina'
# sns setting: "paper" context with slightly enlarged title/label fonts, white grid background
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
sns.set_style("whitegrid")
# plt setting
# NOTE(review): the 'seaborn-whitegrid' style name was renamed to
# 'seaborn-v0_8-whitegrid' in matplotlib >= 3.6 — confirm the pinned version.
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
# Silence library deprecation/future warnings in the notebook output.
warnings.filterwarnings("ignore")
In this article, we use Kaggle's Pima Indians Diabetes dataset. The Pima Indians are a group of Native Americans living in an area consisting of what is now central and southern Arizona. A variety of statistical methods are used here for predictions.
This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.
The datasets consist of several medical predictor variables and one target variable, Outcome. Predictor variables include the number of pregnancies the patient has had, their BMI, insulin level, age, and so on.
# Load the Pima Indians Diabetes dataset from the local CSV into a DataFrame.
Data = pd.read_csv('pima-indians-diabetes-database/diabetes.csv')
# Preview the first rows and report the overall shape (rows, columns).
display(Data.head())
print('The Dataset Shape: %i rows and %i columns' % Data.shape)
| Feature | Explanations |
|---|---|
| Pregnancies | Number of times pregnant |
| Glucose | Plasma glucose concentration at 2 hours in an oral glucose tolerance test |
| BloodPressure | Diastolic blood pressure (mm Hg) |
| SkinThickness | Triceps skinfold thickness (mm) |
| Insulin | 2-Hour serum insulin (mu U/ml) |
| BMI | Body mass index (weight in kg/(height in m)^2) |
| DiabetesPedigreeFunction | Diabetes pedigree function |
| Age | Age (years) |
| Outcome | Whether or not a patient has diabetes |
def Data_info(Inp, Only_NaN = False):
    """Summarise each column of a DataFrame: dtype, NaN count and NaN percentage.

    Parameters
    ----------
    Inp : pd.DataFrame
        Frame to summarise.
    Only_NaN : bool, default False
        If True, keep only the columns that contain at least one NaN.

    Returns
    -------
    pd.DataFrame indexed by column name with columns
    'Data Type', 'Number of NaN Values' and 'Percentage'.
    """
    # Column dtypes, grouped together by type for readability.
    summary = pd.DataFrame(Inp.dtypes, columns=['Data Type']).sort_values(by=['Data Type'])
    # Attach the per-column missing-value counts.
    nan_counts = pd.DataFrame(Inp.isnull().sum(), columns=['Number of NaN Values'])
    summary = summary.join(nan_counts, how='outer')
    # Share of missing rows per column, rounded to two decimals.
    summary['Percentage'] = np.round(100*(summary['Number of NaN Values']/Inp.shape[0]), 2)
    if Only_NaN:
        summary = summary.loc[summary['Number of NaN Values'] > 0]
    return summary
# Dtype and NaN-count summary, transposed; keep only the first two summary rows.
display(Data_info(Data).T[:2])
# missingno bar chart of non-null counts per column (visual NaN check).
_ = msno.bar(Data, figsize=(12,3), fontsize=14, log=False, color="#34495e")
# Standard descriptive statistics (count, mean, std, quartiles, ...).
display(Data.describe())
Let's take a close look at our data.
fig, ax = plt.subplots(nrows=4, ncols=2, figsize = (16, 20))
# One distribution plot (stepped histogram + KDE + rug) per feature; Outcome excluded.
for i, col in enumerate(Data.columns[:-1]):
    row, pos = divmod(i, 2)
    sns.distplot(Data[col], rug=True, rug_kws={"color": "red"},
                 kde_kws={"color": "k", "lw": 2, "label": "KDE"},
                 hist_kws={"histtype": "step", "linewidth": 2,
                           "alpha": 1, "color": "Navy"}, ax=ax[row, pos])
    # Insert spaces into CamelCase names for nicer axis labels; 'BMI' stays as-is.
    if col != 'BMI':
        ax[row, pos].set_xlabel(re.sub(r"(\w)([A-Z])", r"\1 \2", col))
# Hover labels: human-readable class per sample.
Temp = ['Non-Diabetic' if x==0 else 'Diabetic' for x in Data['Outcome']]
# (axis label, column name) pairs for the lower-triangular scatter-plot matrix.
_splom_axes = [('Pregnancies', 'Pregnancies'),
               ('Glucose', 'Glucose'),
               ('Blood<br>Pressure', 'BloodPressure'),
               ('Skin<br>Thickness', 'SkinThickness'),
               ('Insulin', 'Insulin'),
               ('BMI', 'BMI'),
               ('Diabetes<br>Pedigree<br>Fun', 'DiabetesPedigreeFunction'),
               ('Age', 'Age')]
fig = go.Figure(data=go.Splom(
    dimensions=[dict(label=lab, values=Data[col]) for lab, col in _splom_axes],
    showupperhalf=False,
    # Points coloured by Outcome (blue/red scale), thin black outline.
    marker=dict(color=Data['Outcome'], size=4, colorscale='Bluered',
                line=dict(width=0.4, color='black')),
    text=Temp, diagonal=dict(visible=False)))
del Temp
fig.update_layout(title='Scatterplot Matrix', dragmode='select',
                  width=900, height=900, hovermode='closest')
fig.show()
As can be seen, the Data has a normal distribution, and some entries need to be adjusted. In doing so, we defined a normalizer as follows, for a given vector $x$,
\begin{align*} \text{Normalizer}(x, cut) = \begin{cases} x_i &\mbox{if } |x_i- \mu|<\sigma\times cut \\ mode(x) & \mbox{else} \end{cases}. \end{align*}def Normalizer(Col, cut = 3):
return Col[(Col > (Col.mean() - Col.std() * cut)) &
(Col < (Col.mean() + Col.std() * cut))]
fig, ax = plt.subplots(nrows=4, ncols=2, figsize = (16, 20))
# Snapshot of the raw data so the normalisation delta can be plotted later.
Temp = Data.copy()
# Per feature: drop outliers, refill the gaps with the column mode, re-plot.
for i, col in enumerate(Data.columns[:-1]):
    row, pos = divmod(i, 2)
    Data[col] = Normalizer(Data[col])
    Data[col] = Data[col].fillna(Data[col].dropna().mode()[0])
    # Sub-Plots: same distribution plot as before, now on the cleaned column.
    sns.distplot(Data[col], rug=True, rug_kws={"color": "red"},
                 kde_kws={"color": "k", "lw": 2, "label": "KDE"},
                 hist_kws={"histtype": "step", "linewidth": 2,
                           "alpha": 1, "color": "Navy"}, ax=ax[row, pos])
    # 'BMI' is an acronym; every other CamelCase name gets spaces inserted.
    if col != 'BMI':
        ax[row, pos].set_xlabel(re.sub(r"(\w)([A-Z])", r"\1 \2", col))
Basically, we diminished the influence of certain data points (see the following figure).
Temp0 = Temp.copy()
# Element-wise magnitude of the correction introduced by the normalisation
# step (the Outcome column is left untouched).
Temp0.iloc[:,:-1] = abs(Data.iloc[:,:-1] - Temp.iloc[:,:-1])
# Hover labels: human-readable class per sample.
Temp = ['Non-Diabetic' if x==0 else 'Diabetic' for x in Temp0['Outcome']]
# (axis label, column name) pairs for the lower-triangular matrix of deltas.
_delta_axes = [('Pregnancies', 'Pregnancies'),
               ('Glucose', 'Glucose'),
               ('Blood<br>Pressure', 'BloodPressure'),
               ('Skin<br>Thickness', 'SkinThickness'),
               ('Insulin', 'Insulin'),
               ('BMI', 'BMI'),
               ('Diabetes<br>Pedigree<br>Fun', 'DiabetesPedigreeFunction'),
               ('Age', 'Age')]
fig = go.Figure(data=go.Splom(
    dimensions=[dict(label=lab, values=Temp0[col]) for lab, col in _delta_axes],
    showupperhalf=False,
    marker=dict(color=Temp0['Outcome'], size=4, colorscale='Bluered',
                line=dict(width=0.4, color='black')),
    text=Temp, diagonal=dict(visible=False)))
del Temp, Temp0
fig.update_layout(title='Scatterplot Matrix', dragmode='select',
                  width=900, height=900, hovermode='closest')
fig.show()
def Correlation_Plot(Df, Fig_Size):
    """Draw a lower-triangular annotated heatmap of Df's pairwise correlations.

    Parameters
    ----------
    Df : pd.DataFrame
        Frame whose numeric columns are correlated pairwise.
    Fig_Size : float
        Width and height (inches) of the square figure.
    """
    Correlation_Matrix = Df.corr()
    # Boolean mask hiding the strictly-upper triangle; k=1 keeps the diagonal
    # visible (replaces the old float mask + per-element diagonal-clearing loop).
    mask = np.zeros_like(Correlation_Matrix, dtype=bool)
    mask[np.triu_indices_from(mask, k=1)] = True
    Fig, ax = plt.subplots(figsize=(Fig_Size, Fig_Size))
    # NOTE(review): vmin=0 clips negative correlations to the bottom colour —
    # widen to vmin=-1 if anti-correlations should be distinguishable.
    sns.heatmap(Correlation_Matrix, ax=ax, mask=mask, annot=True, square=True,
                cmap=sns.color_palette("RdYlGn", n_colors=10), linewidths=0.2,
                vmin=0, vmax=1, cbar_kws={"shrink": .7})
# Lower-triangular heatmap of pairwise correlations.
Correlation_Plot (Data, 9)
# Per-feature variance, largest first (Outcome excluded).
Temp = Data.iloc[:,:-1].var().sort_values(ascending = False).to_frame(name= 'Variance')
display(Temp)
Temp0 = Data.corr()
# Correlation of the lowest-variance feature (Temp.index[-1]) with all others;
# the trailing [:-1] drops its self-correlation. Bare expression relies on
# notebook auto-display.
Temp0.loc[Temp.index[-1]].sort_values().to_frame(name= 'Correlation')[:-1].T
Even though the variance of Diabetes Pedigree Function is low, dropping it might not improve the performance of the model: the correlation of this feature with the rest of the features, and especially with the Outcome, is noticeable.
# Separate features from the binary target, then hold out 30% of the rows
# for testing (fixed seed keeps the split reproducible).
Target = 'Outcome'
X = Data.drop(columns=[Target])
y = Data[Target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# One-row summary table of the resulting set shapes.
sets = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test}
pd.DataFrame(data={'Set': list(sets),
                   'Shape': [s.shape for s in sets.values()]}).set_index('Set').T
Furthermore, we would like to standardize features by removing the mean and scaling to unit variance.
# Standardise to zero mean / unit variance. The scaler is fitted on the
# training set only, then applied unchanged to the test set (no leakage).
scaler = StandardScaler()
X_train_STD = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_STD = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
Here, we implement an artificial neural network (ANN) using Keras.
# Feed-forward ANN: input (one unit per feature) -> 12 -> 10 -> 1.
# NOTE(review): `init=` is the Keras 1 spelling of `kernel_initializer=` —
# kept for consistency with the rest of the notebook; confirm the Keras version.
model = Sequential()
model.add(Dense(12, input_dim= X_train_STD.shape[1], init='uniform', activation='relu'))
model.add(Dense(10, init='uniform', activation='relu'))
# BUGFIX: the output layer must be sigmoid, not relu — binary_crossentropy
# (used in compile below) expects a probability in [0, 1], which relu does
# not produce.
model.add(Dense(1, init='uniform', activation='sigmoid'))
# Number of iterations (training epochs)
N = int(1e3)
def mean_pred(y_true, y_pred):
    """Custom Keras metric: batch mean of the raw predictions.

    Useful for monitoring prediction drift toward 0 or 1; y_true is ignored.
    """
    return K.mean(y_pred)
# RMSprop + binary cross-entropy; track accuracy and the custom metric above.
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy', mean_pred])
# Train model (silently; per-epoch history is collected for plotting below)
# NOTE(review): `nb_epoch` is the Keras 1 spelling, renamed `epochs` in
# Keras 2 — confirm the pinned Keras version still accepts it.
history = model.fit(X_train_STD, y_train, nb_epoch= N, batch_size=50, verbose=0)
# Predictions and score on the held-out test set
y_pred = model.predict(X_test_STD)
score = model.evaluate(X_test_STD, y_test)
model.summary()
# Single-row frame: one column per compiled metric (loss, accuracy, mean_pred).
score = pd.DataFrame(score, index = model.metrics_names).T
history = pd.DataFrame(history.history)
# NOTE(review): Styler.hide_index() was deprecated in pandas 1.4 in favour of
# .hide(axis='index') — confirm the pinned pandas version.
display(score.style.hide_index())
fig, ax = plt.subplots(1, 1, figsize=(12, 6))
# Training curves: per-epoch accuracy and loss on one axis.
for series, colour, lbl in (('accuracy', 'navy', 'Accuracy'),
                            ('loss', 'red', 'Loss')):
    ax.plot(history[series], colour, label=lbl, linewidth=2)
_ = ax.set_ylim(bottom=0)
_ = ax.set_xlim(left=0, right=N)
# Legend placed outside the axes, to the upper right.
_ = ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0., fontsize=14)
_ = ax.set_xlabel('Steps', fontsize=14)
As expected, the accuracy and loss improve as step number increases.
# Diagram of the network architecture (layer shapes and names).
# NOTE(review): `keras.utils.vis_utils` (imported at the top) was removed in
# newer Keras; `plot_model` now lives at `keras.utils.plot_model` — confirm version.
plot_model(model, show_shapes=True, show_layer_names=True, expand_nested = True)